

# -- start out by loading required libraries

library(lme4)
library(ggplot2)
library(gridExtra)
library(psych)
library(dplyr)
library(xtable)
library(broom)
library(tidyr)
library(reshape2)
library(rms)
library(languageR)
library(MASS)


# -- start out by analysing language skills effects in the brazilian dataset


# -- read in the downloaded dataset

getwd()
setwd("/Users/robdavies/Dropbox/project epilepsy eye tracking 290416")


# -- read in the epilepsy dataframe

epilepsy <- read.csv(file="VelocityDataForRob.csv",head=TRUE,sep=",")


# -- inspect the dataset

summary(epilepsy)

# > summary(epilepsy)
# UID            Task        TRIAL_INDEX      MedsNotTD         Group            Partno         SACType     
# Min.   :1020   Min.   :1.000   Min.   : 1.00   Min.   :0.000   Min.   :0.0000   Min.   :102.0   Min.   :1.000  
# 1st Qu.:1561   1st Qu.:1.000   1st Qu.: 8.00   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:156.0   1st Qu.:1.000  
# Median :2150   Median :1.000   Median :15.00   Median :2.000   Median :0.0000   Median :215.0   Median :1.000  
# Mean   :2010   Mean   :1.478   Mean   :15.42   Mean   :1.542   Mean   :0.3219   Mean   :200.9   Mean   :1.671  
# 3rd Qu.:2351   3rd Qu.:2.000   3rd Qu.:23.00   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:235.0   3rd Qu.:2.000  
# Max.   :2651   Max.   :2.000   Max.   :30.00   Max.   :2.000   Max.   :1.0000   Max.   :265.0   Max.   :3.000  
# CURRENT_SAC_PEAK_VELOCITY CURRENT_SAC_AMPLITUDE
# Min.   : 53.0             Min.   : 1.04        
# 1st Qu.:273.7             1st Qu.: 5.73        
# Median :323.5             Median : 7.05        
# Mean   :330.0             Mean   : 7.22        
# 3rd Qu.:382.4             3rd Qu.: 8.11        
# Max.   :864.4             Max.   :18.44


# -- when we read the data in we can see that numeric variables used to code subject ID or condition are recognized as numbers
# not factors (as they should be) so coerce these variables so that they are treated as factors, as required

epilepsy$UID <- as.factor(epilepsy$UID)
epilepsy$Task <- as.factor(epilepsy$Task)
epilepsy$MedsNotTD <- as.factor(epilepsy$MedsNotTD)
epilepsy$Group <- as.factor(epilepsy$Group)
epilepsy$Partno <- as.factor(epilepsy$Partno)
epilepsy$SACType <- as.factor(epilepsy$SACType)

# -- inspect the results by looking at a summary of the dataset again

summary(epilepsy)

# > summary(epilepsy)
# UID       Task      TRIAL_INDEX    MedsNotTD Group        Partno     SACType  CURRENT_SAC_PEAK_VELOCITY CURRENT_SAC_AMPLITUDE
# 2011   :  47   1:1913   Min.   : 1.00   0: 499    0:2484   123    :  60   1:1913   Min.   : 53.0             Min.   : 1.04        
# 1591   :  46   2:1750   1st Qu.: 8.00   1: 680    1:1179   216    :  60   2:1041   1st Qu.:273.7             1st Qu.: 5.73        
# 2171   :  46            Median :15.00   2:2484             220    :  60   3: 709   Median :323.5             Median : 7.05        
# 1561   :  45            Mean   :15.42                      221    :  60            Mean   :330.0             Mean   : 7.22        
# 2311   :  44            3rd Qu.:23.00                      156    :  59            3rd Qu.:382.4             3rd Qu.: 8.11        
# 2101   :  43            Max.   :30.00                      217    :  59            Max.   :864.4             Max.   :18.44        
# (Other):3392                                               (Other):3305                                                  


# -- what do we need to do?

# -- reviewer comments: "1) As pointed out by the authors saccadic peak velocity depends mainly on the saccade amplitude. Although there is no visible 
# target in the antisaccade task peak eye velocity depends on the amplitude. Usually the main sequence of antisaccades is lower compared to prosaccades. 
# Nevertheless it is essential not to use peak velocity as is [33] but to calculate peak velocity from a suitable fit. The main sequence of saccades in 
# the range from 0 to 10° is nearly linear, above 10° it follows a non-linear function. The authors should calculate for each subject and each condition 
# the peak velocity from fit function parameters, e.g. a robust linear fit function of saccade amplitude vs. peak velocity. From the fit function peak 
# velocity can be calculated for an appropriate amplitude, let’s say an 8° amplitude. This calculation can be done easily using appropriate software, 
# e.g. Matlab. I would have performed these analyses but unfortunately the dataset is not available in the Harvard Dataverse database despite of the 
# statements in the header of the manuscript!"

# -- the key instruction is:
#   The authors should calculate for each subject and each condition 
# # the peak velocity from fit function parameters, e.g. a robust linear fit function of saccade amplitude vs. peak velocity. From the fit function peak 
# # velocity can be calculated for an appropriate amplitude, let’s say an 8° amplitude.


# -- for all data, what is the relationship of saccade amplitude vs. peak velocity?


# -- first, for all data, the OLS coefficient:

summary(lm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data = epilepsy))

# > summary(lm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data = epilepsy))
# 
# Call:
#   lm(formula = CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, 
#      data = epilepsy)
# 
# Residuals:
#   Min      1Q  Median      3Q     Max 
# -259.47  -48.92   -7.33   40.81  636.00 
# 
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)    
# (Intercept)           203.5317     3.6474   55.80   <2e-16 ***
#   CURRENT_SAC_AMPLITUDE  17.5138     0.4752   36.85   <2e-16 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 74.94 on 3661 degrees of freedom
# Multiple R-squared:  0.2706,  Adjusted R-squared:  0.2704 
# F-statistic:  1358 on 1 and 3661 DF,  p-value: < 2.2e-16


# -- what does this look like?

pdf("peakvelocity per condition 050516.pdf", w = 10, h = 10)

plotpeak <- ggplot(data = epilepsy, aes(x = CURRENT_SAC_AMPLITUDE, y = CURRENT_SAC_PEAK_VELOCITY))
plotpeak <- plotpeak + geom_point(alpha = .5) + geom_smooth(size = 2) + geom_vline(x = 8, size = 2, colour = "red") + theme_bw()
plotpeak <- plotpeak + facet_wrap( ~ SACType)
plotpeak

dev.off()


# -- reviewer requires robust regression:
#   http://www.ats.ucla.edu/stat/r/dae/rreg.htm
# -- where note that:
# "Robust regression is done by iterated re-weighted least squares (IRLS). The command for running robust regression is rlm in 
# the MASS package. There are several weighting functions that can be used for IRLS. We are going to first use the Huber weights 
# in this example."  
# -- looks like the default for rlm() is to use Huber weighting function, whatever that is

summary(rlm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data = epilepsy))

# > summary(rlm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data = epilepsy))
# 
# Call: rlm(formula = CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, 
#           data = epilepsy)
# Residuals:
#   Min       1Q   Median       3Q      Max 
# -254.641  -44.619   -3.592   45.049  644.881 
# 
# Coefficients:
#   Value    Std. Error t value 
# (Intercept)           193.4497   3.4665    55.8058
# CURRENT_SAC_AMPLITUDE  18.3587   0.4516    40.6492
# 
# Residual standard error: 66.4 on 3661 degrees of freedom


# -- check alternative weighting function:

summary(rlm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data = epilepsy, psi = psi.bisquare))

# > summary(rlm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data = epilepsy, psi = psi.bisquare))
# 
# Call: rlm(formula = CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, 
#           data = epilepsy, psi = psi.bisquare)
# Residuals:
#   Min       1Q   Median       3Q      Max 
# -253.776  -43.784   -2.695   45.899  646.023 
# 
# Coefficients:
#   Value    Std. Error t value 
# (Intercept)           192.2259   3.4195    56.2140
# CURRENT_SAC_AMPLITUDE  18.4165   0.4455    41.3369
# 
# Residual standard error: 66.4 on 3661 degrees of freedom


# -- evidently not much difference in coefficients if compare coefficients between results of rlm analyses with different
# weighting functions, though there does seem to be a small difference if compare results of lm vs rlm analyses


# -- require perform rlm analysis (use default) for each subject and condition, analysed separately
# -- then use the estimated coefficient of the effect of amplitude peak velocity to estimate peak velocity at 8 degrees


# -- the following done with lm() as a first pass:-
# -- following that, tried to do the same with rlm() -- but that commented out because it failed


# -- start by plotting the data facetted by subjects and conditions:-
# -- what does this look like?

pdf("peakvelocity per subject and condition 050516.pdf", w = 20, h = 20)

plotpeak <- ggplot(data = epilepsy, aes(x = CURRENT_SAC_AMPLITUDE, y = CURRENT_SAC_PEAK_VELOCITY))
plotpeak <- plotpeak + geom_point() + geom_smooth() + geom_vline(x = 8, colour = "red") + theme_bw() + ylim(0, 2500)
plotpeak <- plotpeak + facet_wrap(Partno ~ SACType)
plotpeak

dev.off()


# -- complete a *linear model* for the relationship, separately for each condition (3 conditions, coded 1-3) for each participant
# -- see:
#   https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html  
# -- where: "the group_by() function. It breaks down a dataset into specified groups of rows. When you then apply the verbs above 
# on the resulting object they’ll be automatically applied “by group”."  
# -- so creating a new dataframe epilepsylm by applying lm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE,...) to
# each subject and condition, and extracting the intercept and coefficient for each subject and condition

epilepsylm <- epilepsy %>% group_by(Partno, SACType) %>% do(tidy(lm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data=.)))
epilepsylm$term <- as.factor(epilepsylm$term)

head(epilepsylm, n = 10)
str(epilepsylm)
summary(epilepsylm)

# -- see that have approx 6 x 74 rows: for each of 74 subjects x 3 conditions, one row for the intercept and one for the coefficient of the
# amplitude effect on peak velocity


# -- notice that for some subjects only have data rows for some conditions

# -- can plot estimated coefficient of amplitude on velocity effect, with standard errors


# -- plot the intercepts and slopes by class, using values from the per-class lm fits

epilepsylm.int <- filter(epilepsylm, term == '(Intercept)')
epilepsylm.effect <- filter(epilepsylm, term == 'CURRENT_SAC_AMPLITUDE')

ppeaklm.int <- ggplot(epilepsylm.int, aes(x = Partno, y = estimate, ymin = estimate - std.error, ymax = estimate + std.error))
ppeaklm.int <- ppeaklm.int + geom_point() + geom_linerange() + facet_wrap(~SACType)
ppeaklm.int

ppeaklm.effect <- ggplot(epilepsylm.effect, aes(x = Partno, y = estimate, ymin = estimate - std.error, ymax = estimate + std.error))
ppeaklm.effect <- ppeaklm.effect + geom_point() + geom_linerange() + facet_wrap(~SACType)
ppeaklm.effect


# -- think that need to calculate peak velocity at an amplitude value of 8
# -- can do this easily if intercept and amplitude coefficient are in columns

# -- need to reshape the dataframe:-

# -- we just need the Partno, term and estimate variables

epilepsylm.cast <- epilepsylm[, c("Partno", "SACType", "term", "estimate")]
summary(epilepsylm.cast)


# -- it will also help later if we rename the resulting intercept column to get rid of the brackets

epilepsylm.cast.2 <- within(epilepsylm.cast, {
  
  newterm <- NA
  newterm[term == "(Intercept)"] <- "Intercept"
  newterm[term == "CURRENT_SAC_AMPLITUDE"] <- "CURRENT_SAC_AMPLITUDE"
  
})

epilepsylm.cast.2$newterm <- as.factor(epilepsylm.cast.2$newterm)

# -- get rid of the old term
epilepsylm.cast.3 <- epilepsylm.cast.2[, c("Partno", "SACType", "newterm", "estimate")]

summary(epilepsylm.cast.3)


# -- then rearrange the dataset so that intercept and amplitude effect coefficients are in separate columns for each subject
# -- notice that require use dcast() function (not cast()), see ??dcast help:
# https://cran.r-project.org/web/packages/reshape2/reshape2.pdf
# -- in the dcast() funcation call, "Partno + SACType ~ newterm" the terms on the left of the ~ define the rows, on the right
# define the columns

epilepsylm.cast.4 <- dcast(epilepsylm.cast.3, Partno + SACType ~ newterm)
summary(epilepsylm.cast.4)

# > summary(epilepsylm.cast.4)
# Partno    SACType CURRENT_SAC_AMPLITUDE   Intercept     
# 102    :  3   1:73    Min.   :-31.04        Min.   :-509.0  
# 114    :  3   2:69    1st Qu.: 14.83        1st Qu.: 107.9  
# 117    :  3   3:69    Median : 23.81        Median : 156.2  
# 119    :  3           Mean   : 24.19        Mean   : 175.1  
# 123    :  3           3rd Qu.: 31.29        3rd Qu.: 225.5  
# 129    :  3           Max.   :219.78        Max.   : 759.0  
# (Other):193           NA's   :8 


# -- we can then "predict" the peak velocity value for an amplitude of 8:

epilepsylm.cast.4$peakat8 <- epilepsylm.cast.4$Intercept + (epilepsylm.cast.4$CURRENT_SAC_AMPLITUDE*8)

summary(epilepsylm.cast.4)


# -- write the file out as a csv:-

write.csv(epilepsylm.cast.4, file = "epilepsy-estimated-peak-velocity-at-8-degrees-050516.csv", row.names = FALSE)



####################################################################################################################################


# -- in the following, have code for trying the same estimation of peak velocity per subject and condition at 8 degrees, given estimates 
# of intercepts and coefficient of effect of amplitude on peak velocity, 
# -- the code did not work as rlm would not run


# -- try the following but see the error:
#   > epilepsyrlm <- epilepsy %>% group_by(Partno, SACType) %>% do(tidy(rlm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data=.)))
# Error in rlm.default(x, y, weights, method = method, wt.method = wt.method,  : 
#                        'x' is singular: singular fits are not implemented in 'rlm'
#                      In addition: Warning message:
#                        In rlm.default(x, y, weights, method = method, wt.method = wt.method,  :
#                                         'rlm' failed to converge in 20 steps
#                                       > epilepsyrlm$term <- as.factor(epilepsyrlm$term)
#                                       Error in is.factor(x) : object 'epilepsyrlm' not found  
 

# -- this may be because there are missing data for some subjects
# -- when look at peakvelocity plot per subject and condition can see that for some subjects there are too few data
# in each condition to fit models


# # -- complete a *robust linear model* for the relationship, separately for each condition (3 conditions, coded 1-3) for each participant
# # -- see:
# #   https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html  
# # -- where: "the group_by() function. It breaks down a dataset into specified groups of rows. When you then apply the verbs above 
# # on the resulting object they’ll be automatically applied “by group”."  
# # -- so creating a new dataframe epilepsylm by applying lm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE,...) to
# # each subject and condition, and extracting the intercept and coefficient for each subject and condition
# 
# epilepsyrlm <- epilepsy %>% group_by(Partno, SACType) %>% do(tidy(rlm(CURRENT_SAC_PEAK_VELOCITY ~ CURRENT_SAC_AMPLITUDE, data=.)))
# epilepsyrlm$term <- as.factor(epilepsyrlm$term)
# 
# head(epilepsyrlm, n = 10)
# str(epilepsyrlm)
# summary(epilepsyrlm)
# 
# # -- see that have approx 6 x 74 rows: for each of 74 subjects x 3 conditions, one row for the intercept and one for the coefficient of the
# # amplitude effect on peak velocity
# 
# 
# # -- notice that for some subjects only have data rows for some conditions
# 
# 
# # -- think that need to calculate peak velocity at an amplitude value of 8
# # -- can do this easily if intercept and amplitude coefficient are in columns
# 
# # -- need to reshape the dataframe:-
# 
# # -- we just need the Partno, term and estimate variables
# 
# epilepsyrlm.cast <- epilepsyrlm[, c("Partno", "SACType", "term", "estimate")]
# summary(epilepsyrlm.cast)
# 
# 
# # -- it will also help later if we rename the resulting intercept column to get rid of the brackets
# 
# epilepsyrlm.cast.2 <- within(epilepsyrlm.cast, {
#   
#   newterm <- NA
#   newterm[term == "(Intercept)"] <- "Intercept"
#   newterm[term == "CURRENT_SAC_AMPLITUDE"] <- "CURRENT_SAC_AMPLITUDE"
#   
# })
# 
# epilepsyrlm.cast.2$newterm <- as.factor(epilepsyrlm.cast.2$newterm)
# 
# # -- get rid of the old term
# epilepsyrlm.cast.3 <- epilepsyrlm.cast.2[, c("Partno", "SACType", "newterm", "estimate")]
# 
# summary(epilepsyrlm.cast.3)
# 
# 
# # -- then rearrange the dataset so that intercept and amplitude effect coefficients are in separate columns for each subject
# # -- notice that require use dcast() function (not cast()), see ??dcast help:
# # https://cran.r-project.org/web/packages/reshape2/reshape2.pdf
# # -- in the dcast() funcation call, "Partno + SACType ~ newterm" the terms on the left of the ~ define the rows, on the right
# # define the columns
# 
# epilepsyrlm.cast.4 <- dcast(epilepsyrlm.cast.3, Partno + SACType ~ newterm)
# 
# 
# # -- we can then "predict" the peak velocity value for an amplitude of 8:
# 
# epilepsyrlm.cast.4$peakat8 <- epilepsyrlm.cast.4$Intercept + (epilepsyrlm.cast.4$CURRENT_SAC_AMPLITUDE*8)








